#R libraries
library(knitr)
library(yaml)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
require(graphics)
library(tidyverse)
## -- Attaching packages -------------------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## v purrr 0.3.4
## -- Conflicts ----------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggthemes)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(stringr)
library(stringi)
library(mapproj)
library(RCurl)
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
library(readr)
library(rio)
##
## Attaching package: 'rio'
## The following object is masked from 'package:plotly':
##
## export
library(naniar)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(grid)
library(mice)
## Warning: package 'mice' was built under R version 4.0.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:RCurl':
##
## complete
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(class)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
library(datasets)
#Beer and Breweries data input from CSV files
# This reads in the Beers data from select folder file Beers.csv.
Beerdata <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306/main/Beers.csv",
header = T,sep = ",",na.strings = "NA",fill = TRUE)
head(Beerdata)
## Name Beer_ID ABV IBU Brewery_id
## 1 Pub Beer 1436 0.050 NA 409
## 2 Devil's Cup 2265 0.066 NA 178
## 3 Rise of the Phoenix 2264 0.071 NA 178
## 4 Sinister 2263 0.090 NA 178
## 5 Sex and Candy 2262 0.075 NA 178
## 6 Black Exodus 2261 0.077 NA 178
## Style Ounces
## 1 American Pale Lager 12
## 2 American Pale Ale (APA) 12
## 3 American IPA 12
## 4 American Double / Imperial IPA 12
## 5 American IPA 12
## 6 Oatmeal Stout 12
#Number of rows in Beer data
nrow(Beerdata)
## [1] 2410
# This reads in the Beers data from select folder file Breweries.csv.
Breweriesdata <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306/main/Breweries.csv",
header = T,sep = ",",na.strings = "NA", fill = TRUE)
head(Breweriesdata)
## Brew_ID Name City State
## 1 1 NorthGate Brewing Minneapolis MN
## 2 2 Against the Grain Brewery Louisville KY
## 3 3 Jack's Abby Craft Lagers Framingham MA
## 4 4 Mike Hess Brewing Company San Diego CA
## 5 5 Fort Point Beer Company San Francisco CA
## 6 6 COAST Brewing Company Charleston SC
#Number of rows in Breweries data
nrow(Breweriesdata)
## [1] 558
#Generating geographic plots from breweries data (following chunk generate the final plot)
#makes a data frame with State name and abbreviation.
lookup = data.frame(abb = state.abb, State = state.name)
dc <- c("DC", "District of Columbia")
lookup <- rbind(lookup,dc)
# Change Column Name State to abb (abbreviation)
colnames(Breweriesdata)[4] = "abb"
# Removes left space in state abb data taken from Breweries CSV
Breweriesdata$abb <- trimws(Breweriesdata$abb,which = c("left"))
# make one dataset with state names and abb
Brewdata <- merge(Breweriesdata,lookup,"abb")
Brewcount <- count(Brewdata,State,abb) #count up the occurrence of each state
colnames(Brewcount)[3] = "Breweries_Count" #change "n" to "Breweries_Count"
# added state name also changed to lower case
Brewcount$region <- tolower(Brewcount$State)
Brewcount2 = Brewcount[-1] #removed first column from Brew count data state name
states <- map_data("state") #contains data for states excluding Alaska, Hawaii
#Added Alaska to states as the states data does not include Alaska (Hawaii not in data so did not include in below data)
Alaska <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306/main/Alaska.csv",header = T,sep = ",",na.strings = "NA",
fill = TRUE);
states <- rbind(states,Alaska)
#map.df is data frame containing longitude and latitude to form state and breweries count by state
map.df <- merge(states,Brewcount2, by="region", all.x=T)
map.df <- map.df[order(map.df$order),]
#Breweries count by state using gradient graphics
plot <- map.df %>% ggplot(aes(x=long, y=lat, group = group)) +
geom_polygon(aes(fill = Breweries_Count)) + geom_path() +
scale_fill_gradientn(colours=rev(topo.colors(10)),na.value="grey90",
breaks=c(0,5,10,15,20,25,30,35,40,45,50))+
ggtitle("Breweries Count by State") + coord_map()
#state center longitude and latitude (read data from csv file)
st_center <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306/main/long_lat_statecenter.csv",header = T,sep = ",",na.strings = "NA",
fill = TRUE);
# state names changed to lower case
st_center$region <- tolower(st_center$region)
#map.df1 merges state center with Breweries count data
map.df1 <- merge(st_center,Brewcount2, by="region", all.x = T)
# Adding state names and Breweries count data by state
Nplot <- plot + geom_point(data=map.df1, aes(x=long, y=lat,
size = Breweries_Count, group = region),
show.legend = FALSE) +
geom_text(aes( x= long, y = lat,label = abb.x, group = region),
data = map.df1, vjust=-1.2) +
geom_text(aes(x = long, y = lat, label = Breweries_Count, group = region),
data = map.df1,vjust=1.8)
#Code below adds scaled legend to the Breweries count by state
cplot <- Nplot + theme(legend.background = element_rect(fill="gray90",
size=10, linetype="dotted"),
legend.key.height = unit(4,"lines")) +
theme(plot.title = element_text(size=14, face= "bold", colour= "black"))
#Final plot with state names and Breweries count by state
cplot #plot best viewed in scaled full screen zoom 1920px x 1080px (21" or higher dim screen)
The final plot shows that ‘Colorado’ has the highest (47)number of breweries followed by ‘California (39)’, ‘Michigan(32)’ and ‘Oregon(29)’ being the next three successors.
#Merging beer data and breweries data
colnames(Beerdata)[5] = "Brew_ID" #Changing columun name to combine both beer data and breweries data
Totaldata <- merge(Beerdata,Breweriesdata, by="Brew_ID", all.x = T) %>% na_if("")
colnames(Totaldata)[2] = "Beer_Name" #Changing the column name to Beer name
colnames(Totaldata)[8] = "Brewery_Name" # Changing the column name to Brewery name
colnames(Totaldata)[10] = "State" # Changing the column name back to State
#First 6 observations after merging beer data and breweries data
head(Totaldata,6)
## Brew_ID Beer_Name Beer_ID ABV IBU Style
## 1 1 Get Together 2692 0.045 50 American IPA
## 2 1 Maggie's Leap 2691 0.049 26 Milk / Sweet Stout
## 3 1 Wall's End 2690 0.048 19 English Brown Ale
## 4 1 Pumpion 2689 0.060 38 Pumpkin Ale
## 5 1 Stronghold 2688 0.060 25 American Porter
## 6 1 Parapet ESB 2687 0.056 47 Extra Special / Strong Bitter (ESB)
## Ounces Brewery_Name City State
## 1 16 NorthGate Brewing Minneapolis MN
## 2 16 NorthGate Brewing Minneapolis MN
## 3 16 NorthGate Brewing Minneapolis MN
## 4 16 NorthGate Brewing Minneapolis MN
## 5 16 NorthGate Brewing Minneapolis MN
## 6 16 NorthGate Brewing Minneapolis MN
#Last 6 observations after merging beer data and breweries data
tail(Totaldata,6)
## Brew_ID Beer_Name Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Brewery_Name City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2405 CA
## 2406 NY
## 2407 NY
## 2408 NY
## 2409 NY
## 2410 AK
#Missing Values in ABV/IBU columns
table(is.na(Totaldata$ABV)) # True value represents NA values in the column ABV in original data
##
## FALSE TRUE
## 2348 62
table(is.na(Totaldata$IBU)) # True value represents NA values in the column IBU in original data
##
## FALSE TRUE
## 1405 1005
table(is.na(Totaldata$Style)) # True value represents NA values in the column Style in original data
##
## FALSE TRUE
## 2405 5
gg_miss_var(Totaldata)+ ggtitle("Missing data values by Category")+scale_y_log10() + theme(
# AXIS LABLES APPEARANCE
plot.title = element_text(size=14, face= "bold", colour= "black" ),
axis.title.x = element_text(size=12, face="bold", colour = "black"),
axis.title.y = element_text(size=12, face="bold", colour = "black"),
axis.text.x = element_text(size=12, face="bold", colour = "black"),
axis.text.y = element_text(size=12, face="bold", colour = "black"),
strip.text.x = element_text(size = 10, face="bold", colour = "black" ),
strip.text.y = element_text(size = 10, face="bold", colour = "black"),
)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
The plot and the values in tables above provide the missing data values in ABV (65), IBU (1005) and Style (5) columns from originally considered beer data.
#Addressing the missing values
#MICE - Multivariate Imputation by Chained Equations method is used to impute the missing variables in ABV and IBU columns
imp <- mice(Totaldata, method='norm.predict', m=5)
##
## iter imp variable
## 1 1 ABV IBU
## 1 2 ABV IBU
## 1 3 ABV IBU
## 1 4 ABV IBU
## 1 5 ABV IBU
## 2 1 ABV IBU
## 2 2 ABV IBU
## 2 3 ABV IBU
## 2 4 ABV IBU
## 2 5 ABV IBU
## 3 1 ABV IBU
## 3 2 ABV IBU
## 3 3 ABV IBU
## 3 4 ABV IBU
## 3 5 ABV IBU
## 4 1 ABV IBU
## 4 2 ABV IBU
## 4 3 ABV IBU
## 4 4 ABV IBU
## 4 5 ABV IBU
## 5 1 ABV IBU
## 5 2 ABV IBU
## 5 3 ABV IBU
## 5 4 ABV IBU
## 5 5 ABV IBU
## Warning: Number of logged events: 5
Totaldata_imp=complete(imp)
gg_miss_var(Totaldata_imp) # shows only style as missing data
table(is.na(Totaldata_imp$Style)) # True value represents NA values in the column Style in original data
##
## FALSE TRUE
## 2405 5
Totaldata_imp[c(which(is.na(Totaldata_imp$Style))),]
## Brew_ID Beer_Name Beer_ID ABV IBU Style
## 227 30 Special Release 2210 0.06286884 46.13160 <NA>
## 455 67 OktoberFiesta 2527 0.05300000 27.00000 <NA>
## 946 161 Kilt Lifter Scottish-Style Ale 1635 0.06000000 21.00000 <NA>
## 992 167 The CROWLERâ„¢ 1796 0.07755227 62.09427 <NA>
## 993 167 CAN'D AID Foundation 1790 0.05870227 41.53910 <NA>
## Ounces Brewery_Name City State
## 227 16 Cedar Creek Brewery Seven Points TX
## 455 12 Freetail Brewing Company San Antonio TX
## 946 12 Four Peaks Brewing Company Tempe AZ
## 992 32 Oskar Blues Brewery Longmont CO
## 993 12 Oskar Blues Brewery Longmont CO
#Internet search for all the missing styles above
Totaldata_imp[227,"Style"] = "English India Pale Ale (IPA)" #"https://irp-cdn.multiscreensite.com/b5112d09/files/uploaded/Beer%20Menu_2up%20%281%29-1.png"
Totaldata_imp[455,"Style"] = "Vienna Lager" #https://www.craftbeer.com/styles/german-style-marzen-oktoberfest#:~:text=A%20beer%20rich%20in%20malt,similar%20to%20the%20Vienna%20lager.&text=Originating%20in%20Germany%2C%20this%20style,or%20lagered%2C%20throughout%20the%20summer.
Totaldata_imp[946,"Style"] = "Scotch Ale / Wee Heavy" #based on name
Totaldata_imp[992,"Style"] = "German Pilsener" #https://www.oskarblues.com/beers/
Totaldata_imp[993,"Style"] = "German Pilsener" #https://www.oskarblues.com/beers/
Totaldata_imp$Ounces <- as.factor(Totaldata_imp$Ounces)
#Summary count of number of ABV values by categorized Ounces of beer
s <- Totaldata_imp %>% group_by(Ounces)%>% summarize(ABV,count=n())
## `summarise()` regrouping output by 'Ounces' (override with `.groups` argument)
# Bar plot of ABV v. Ounces to see how much data is available
Totaldata_imp %>% ggplot(aes(x = Ounces, y = s$count[1], fill=Ounces)) + geom_col() +
ggtitle("Alcohol By Volume (ABV) data count by Ounce Category") +
labs(y="Alcohol By Volume (ABV)") +
geom_text(aes(Ounces, s$count+50, label = s$count, fill = NULL), data = s)
#Median for ABV and IBU content for each state (final plot given in below chunk)
#Evaluating median of ABV and IBU (all NA values removed from data)
Median_ABV <- Totaldata_imp %>% arrange(State) %>% group_by(State) %>%
summarize(Median_ABV = median(ABV, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
Median_IBU <- Totaldata_imp %>% arrange(State) %>% group_by(State) %>%
summarize(Median_IBU = median(IBU, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
Median_ABV <- data.frame(Median_ABV)
Median_IBU <- data.frame(Median_IBU)
Median_ABV$Cat <- "Median_ABV"
Median_IBU$Cat <- "Median_IBU"
#Bar plot for Median ABV/IBU content for each state
Medianplot_ABV_IBU <- Median_ABV %>% ggplot(mapping = aes(x,y)) +
geom_bar(aes(x = State, y = Median_ABV, fill = State), group = Median_ABV$State,
stat = 'identity', show.legend = FALSE) +
ggtitle('Median ABV Content by State') + geom_text(aes(State, Median_ABV+0.005,
label = percent(Median_ABV, accuracy = 0.01),
fill = NULL), data = Median_ABV, angle = 90) +
geom_bar(data=Median_IBU, aes(x = State, y = Median_IBU, fill = State), stat = 'identity',
show.legend = FALSE) +
geom_text(aes(State, Median_IBU+2, label = round(Median_IBU,digits = 0), fill = NULL), data = Median_IBU) +
ggtitle('Median ABV and Median IBU Content by State') + facet_grid(Cat~., scale = "free_y",
labeller = label_parsed, switch = "y") + labs(x="State") +
theme(axis.title.y = element_blank(), strip.placement = "outside",axis.title.x = element_text(size=12, face="bold", colour = "black"),
axis.text.x = element_text(size=12, face="bold", colour = "black"),
axis.text.y = element_text(size=12, face="bold", colour = "black"),
strip.text.x = element_text(size = 10, face="bold", colour = "black" ),
strip.text.y = element_text(size = 10, face="bold", colour = "black"),
)
Medianplot_ABV_IBU
#plot best viewed in scaled full screen zoom 1920px x 1080px (21" or higher dim screen)
From the median ABV and median IBU plot above it can be observed that ‘District of Columbia’ and ‘Kentucky’ have the highest median ABV values shown in percentage. ‘Maine’ and ‘West Virginia’ have the highest median IBU values.
#collecting max data
Max_ABV <- Totaldata_imp %>% arrange(State) %>% group_by(State) %>%
summarize(Max_ABV = max(ABV, na.rm=TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
Max_IBU <- Totaldata_imp %>% arrange(State) %>% group_by(State) %>%
summarize(Max_IBU = max(IBU, na.rm=TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
Max_ABV$Cat <- "Max_ABV"
Max_IBU$Cat <- "Max_IBU"
#Bar plot for max ABV content for each state
Maxplot_ABV_IBU <- Max_ABV %>% ggplot(mapping = aes(x,y)) +
geom_bar(aes(x = State, y = Max_ABV, fill = State), group = Max_ABV$State,
stat = 'identity', show.legend = FALSE) +
ggtitle('Max ABV Content by State') + geom_text(aes(State, Max_ABV+0.0085,
label = percent(Max_ABV, accuracy = 0.01),
fill = NULL), data = Max_ABV, angle = 90) +
geom_bar(data=Max_IBU, aes(x = State, y = Max_IBU, fill = State), stat = 'identity',
show.legend = FALSE) +
geom_text(aes(State, Max_IBU+5, label = round(Max_IBU,digits = 0), fill = NULL), data = Max_IBU) +
ggtitle('Max ABV and Max IBU Content by State') + facet_grid(Cat~., scale = "free_y",
labeller = label_parsed, switch = "y") + labs(x="State") +
theme(axis.title.y = element_blank(), strip.placement = "outside",axis.title.x = element_text(size=12, face="bold", colour = "black"),
axis.text.x = element_text(size=12, face="bold", colour = "black"),
axis.text.y = element_text(size=12, face="bold", colour = "black"),
strip.text.x = element_text(size = 10, face="bold", colour = "black" ),
strip.text.y = element_text(size = 10, face="bold", colour = "black"),
)
Maxplot_ABV_IBU
#plot best viewed in scaled full screen zoom 1920px x 1080px (21" or higher dim screen)
From the Max ABV and IBU plot it is observed that ‘Colorado’ has the max ABV value of 12.8% and ‘Oregon’ has the max IBU value of 138.
Stats <- Totaldata_imp %>% summarize(Mean = mean(ABV, na.rm=TRUE),
Median = median(ABV,na.rm=T), Max = max(ABV,na.rm=T), Min = min(ABV,na.rm=T),
SD = sd(ABV,na.rm=T), N = n())
#Histogram and Density Plot
His_Den <- Totaldata_imp %>% filter(!is.na(ABV)) %>% ggplot(aes(x=ABV)) +
geom_histogram(aes(y=..density..),colour='red',fill='blue', binwidth = 0.0035) +
geom_density(alpha=.4, fill='#FFFF00') +
ggtitle('ABV Statistical Attributes - Histogram, Density and Box Plots') +
scale_x_continuous(breaks = seq(0,0.14,0.01),labels = seq(0,0.14,0.01)) + labs(y="Density / Count")
#Box plot for ABV data
Box <- Totaldata_imp %>% filter(!is.na(ABV)) %>% ggplot(aes(x=ABV)) +
geom_boxplot(col='black',fill='#FF6666') + scale_x_continuous(expand = c(0,0), limit = c(0,0.14)) + scale_y_continuous(expand = c(0,0), limit = c(-0.4,0.4))
#Histogram density plot and Box plot on same scale
grid.draw(rbind(ggplotGrob(His_Den),
ggplotGrob(Box),
size = "first"))
Stats
## Mean Median Max Min SD N
## 1 0.05976762 0.057 0.128 0.001 0.01337577 2410
All statistically significant attributes for the ABV values are shown above The distribution of ABV is slightly right skewed. ABV values from 4.90% to 5.95% are the most widely used. 6.5% ABV values show second highest peak from the histogram. . There are total 2410 non-missing ABV values in this data set. The maximum ABV is 12.8%, the minimum ABV is .1%. The mean ABV is 5.98%, median 5.7% and standard deviation of ABV is 1.34%.
#Scatter plot with
Totaldata_imp %>% filter(!is.na(ABV) & !is.na(IBU)) %>%
ggplot(aes(y=ABV, x=IBU)) + geom_point(aes(colour = ABV/IBU)) + geom_smooth(method=loess) + ggtitle("ABV V. IBU Plot") + scale_y_continuous(labels = percent)
## `geom_smooth()` using formula 'y ~ x'
The scatter plot indicates there is a moderately positive linear relationship (i.e., as IBU increases ABV increases). Most beers with lower IBU (less than 50) have ABV values around 5%. When IBU value increases, ABV values spreads out. But most beers with IBU values above 50, their ABV values spread out within the region between 5% and 10%.
# ABV/ IBU for IPA
Data_IPA <- Totaldata_imp %>%
filter(str_detect(Style, regex(str_c('\\b','IPA','\\b',sep=''), ignore_case = T)))
#Stats IPA
Stats_IPA_ABV <- Data_IPA %>% summarize(Mean = mean(ABV, na.rm=TRUE),
Median = median(ABV,na.rm=T), Max = max(ABV,na.rm=T), Min = min(ABV,na.rm=T),
SD = sd(ABV,na.rm=T), N = n())
Stats_IPA_IBU <- Data_IPA %>% summarize(Mean = mean(IBU, na.rm=TRUE),
Median = median(IBU,na.rm=T), Max = max(IBU,na.rm=T), Min = min(IBU,na.rm=T),
SD = sd(IBU,na.rm=T), N = n())
#Histogram and Density Plot for IPA with ABV
His_Den_IPA_ABV <- Data_IPA %>% ggplot(aes(x=ABV)) +
geom_histogram(aes(y=..density..),colour='red',fill='blue', binwidth = 0.0035) +
geom_density(alpha=.4, fill='#FFFF00') +
ggtitle('IPA - ABV Statistical Attributes - Histogram, Density') +
scale_x_continuous(breaks = seq(0,0.14,0.01),labels = seq(0,0.14,0.01)) + labs(y="Density / Count")
#Histogram and Density Plot for IPA with IBU
His_Den_IPA_IBU <- Data_IPA %>% ggplot(aes(x=IBU)) +
geom_histogram(aes(y=..density..),colour='red',fill='pink') +
geom_density(alpha=.2, fill='#FFFF00') +
ggtitle('IPA - IBU Statistical Attributes - Histogram, Density') +
scale_x_continuous() + labs(y="Density / Count")
#ABV / IBU for Ale
Data_Ale <- Totaldata_imp %>%
filter(str_detect(Style, regex(str_c('\\b','Ale','\\b',sep=''), ignore_case = T)))
Stats_Ale_ABV <- Data_Ale %>% summarize(Mean = mean(ABV, na.rm=TRUE),
Median = median(ABV,na.rm=T), Max = max(ABV,na.rm=T), Min = min(ABV,na.rm=T),
SD = sd(ABV,na.rm=T), N = n())
Stats_Ale_IBU <- Data_Ale %>% summarize(Mean = mean(IBU, na.rm=TRUE),
Median = median(IBU,na.rm=T), Max = max(IBU,na.rm=T), Min = min(IBU,na.rm=T),
SD = sd(IBU,na.rm=T), N = n())
#Histogram and Density Plot for Ale with ABV
His_Den_Ale_ABV <- Data_Ale %>% ggplot(aes(x=ABV)) +
geom_histogram(aes(y=..density..),colour='red',fill='blue', binwidth = 0.0035) +
geom_density(alpha=.4, fill='#FFFF00') +
ggtitle('Ale - ABV Statistical Attributes - Histogram, Density') +
scale_x_continuous(breaks = seq(0,0.19,0.01),labels = seq(0,0.19,0.01)) + labs(y="Density / Count")
#Histogram and Density Plot for Ale with IBU
His_Den_Ale_IBU <- Data_Ale %>% ggplot(aes(x=IBU)) +
geom_histogram(aes(y=..density..),colour='red',fill='pink') +
geom_density(alpha=.2, fill='#FFFF00') +
ggtitle('Ale - IBU Statistical Attributes - Histogram, Density') +
scale_x_continuous() + labs(y="Density / Count")
#Fitting all the statistical data in one screen
require(gridExtra)
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
a <- grid.arrange(His_Den_IPA_ABV,His_Den_IPA_IBU,His_Den_Ale_ABV,His_Den_Ale_IBU,ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Stats_IPA_ABV
## Mean Median Max Min SD N
## 1 0.06860334 0.068 0.099 0.027 0.01234518 572
Stats_IPA_IBU
## Mean Median Max Min SD N
## 1 65.78195 65 138 0.4818612 20.66601 572
Stats_Ale_ABV
## Mean Median Max Min SD N
## 1 0.0569628 0.055 0.099 0.035 0.0110365 978
Stats_Ale_IBU
## Mean Median Max Min SD N
## 1 36.67542 35 115 4 16.64074 978
The above plots and values show IPA’s have higher ABV and IBU values than the Ale’s. This indicates that IPA’s are more alcoholic and more bitter than the Ale’s in general.
#ABV_IBU with respect to IPAs and Ale’s using kNN
#Checking to see number of rows in each IPA and Ale data
nrow(Data_IPA)
## [1] 572
nrow(Data_Ale)
## [1] 978
Data_IPA$Style1 <- "IPA" #Adding Style1 column for IPA style beer as IPA
Data_Ale$Style1 <- "Ale" #Adding Style1 column for Ale style beer as Ale
Data_IPA$Style1 <- as.factor(Data_IPA$Style1)
Data_Ale$Style1 <- as.factor(Data_Ale$Style1)
#Combining the IPA and Ale data to create a training set
Data_Ale_IPA <- rbind(Data_IPA, Data_Ale)
#Plot of all IPA and Ale data
Data_Ale_IPA %>% ggplot(aes(x=IBU, y=ABV)) + geom_point(aes(colour=Style1)) +
theme(axis.title.x = element_text(size=12, face="bold", colour = "black"),
axis.text.x = element_text(size=12, face="bold", colour = "black"),
axis.title.y = element_text(size=12, face="bold", colour = "black"),
axis.text.y = element_text(size=12, face="bold", colour = "black"),
strip.text.x = element_text(size = 10, face="bold", colour = "black" ),
strip.text.y = element_text(size = 10, face="bold", colour = "black"),
) + ggtitle("ABV v. IBU based on Style") + labs(colour = "Style")
# kNN approach for 500 dataset iterations and 1-30 k values
iterations = 500 # number of iterations to test the k value
numks = 30 # number of k used in the iterations
splitPerc = .7 # split percentage assumed from the total dataset
masterAcc = matrix(nrow = iterations, ncol = numks)
set.seed(6)
trainIndices = sample(1:dim(Data_Ale_IPA)[1],round(splitPerc * dim(Data_Ale_IPA)[1]))
beer_train = Data_Ale_IPA[trainIndices,]
beer_test = Data_Ale_IPA[-trainIndices,]
#columns 4 and 5 represent the ABV and IBU values
classifications = knn(beer_train[,c(4,5)],beer_test[,c(4,5)],beer_train$Style1, prob = TRUE, k = 5)
CM = confusionMatrix(table(classifications,beer_test$Style1))
classifications
## [1] IPA IPA IPA IPA Ale Ale IPA IPA IPA IPA Ale Ale IPA IPA IPA IPA IPA Ale
## [19] IPA IPA Ale IPA IPA IPA IPA IPA IPA IPA Ale IPA Ale IPA Ale IPA IPA IPA
## [37] IPA IPA Ale IPA IPA IPA IPA IPA Ale Ale IPA IPA IPA IPA IPA IPA IPA IPA
## [55] IPA IPA IPA IPA IPA IPA IPA IPA IPA Ale IPA Ale IPA IPA Ale IPA Ale IPA
## [73] IPA IPA Ale IPA IPA IPA IPA Ale Ale IPA IPA IPA Ale IPA IPA IPA IPA IPA
## [91] Ale Ale IPA Ale Ale IPA Ale Ale Ale IPA IPA IPA Ale Ale Ale IPA Ale Ale
## [109] IPA IPA Ale IPA Ale Ale Ale IPA IPA IPA IPA IPA IPA IPA IPA IPA Ale Ale
## [127] Ale IPA IPA IPA Ale IPA IPA Ale IPA IPA IPA Ale Ale IPA Ale IPA Ale Ale
## [145] Ale Ale Ale IPA IPA Ale Ale IPA IPA IPA IPA Ale Ale Ale Ale Ale Ale Ale
## [163] Ale Ale Ale IPA IPA Ale IPA Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale
## [181] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale IPA Ale Ale
## [199] Ale Ale IPA Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [217] Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale IPA
## [235] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale
## [253] Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [271] Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale IPA Ale Ale IPA
## [289] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [307] Ale IPA Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale
## [325] Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [343] IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale
## [361] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [379] Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale IPA IPA IPA
## [397] Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale
## [415] Ale Ale IPA Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale
## [433] Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale
## [451] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale
## attr(,"prob")
## [1] 0.6000000 1.0000000 0.6000000 0.6000000 0.8000000 0.6000000 0.8750000
## [8] 0.8888889 1.0000000 0.6000000 0.8235294 0.6000000 1.0000000 1.0000000
## [15] 1.0000000 0.5000000 0.6000000 0.6000000 1.0000000 1.0000000 0.8333333
## [22] 0.8000000 0.8888889 1.0000000 1.0000000 0.8000000 0.6000000 1.0000000
## [29] 0.6000000 0.6000000 0.6000000 0.6000000 1.0000000 0.6666667 0.8000000
## [36] 1.0000000 1.0000000 1.0000000 0.8333333 0.8000000 0.6000000 0.8000000
## [43] 0.8000000 1.0000000 0.6000000 0.6000000 1.0000000 1.0000000 1.0000000
## [50] 1.0000000 1.0000000 1.0000000 1.0000000 0.8571429 0.8333333 1.0000000
## [57] 0.8000000 0.8000000 0.8000000 1.0000000 0.8000000 1.0000000 0.7500000
## [64] 0.8000000 0.8000000 0.8000000 0.8000000 1.0000000 0.8000000 0.7777778
## [71] 0.6000000 0.6000000 0.6000000 1.0000000 1.0000000 1.0000000 0.6000000
## [78] 0.6000000 1.0000000 1.0000000 0.6000000 0.6000000 0.8000000 1.0000000
## [85] 0.6000000 0.6000000 0.8000000 0.8571429 0.8571429 1.0000000 0.8000000
## [92] 1.0000000 1.0000000 0.6666667 0.7500000 1.0000000 0.6666667 0.6666667
## [99] 0.6666667 0.6000000 0.8571429 0.8333333 0.8235294 0.6000000 1.0000000
## [106] 0.8571429 0.8000000 0.8000000 0.8000000 0.8000000 0.6000000 0.8000000
## [113] 0.6000000 0.8333333 0.8235294 0.6666667 1.0000000 1.0000000 1.0000000
## [120] 1.0000000 0.9090909 0.8888889 1.0000000 1.0000000 1.0000000 0.7000000
## [127] 0.6000000 1.0000000 0.8888889 0.6000000 0.6000000 0.8000000 0.9090909
## [134] 0.8000000 1.0000000 1.0000000 0.8000000 0.8000000 0.6000000 0.8000000
## [141] 0.6000000 0.7142857 0.6000000 0.6000000 0.6000000 0.6000000 0.6000000
## [148] 1.0000000 1.0000000 0.6000000 0.8235294 0.8000000 0.7142857 0.8333333
## [155] 1.0000000 0.6000000 1.0000000 0.8000000 0.6000000 0.8000000 0.6000000
## [162] 1.0000000 1.0000000 1.0000000 0.6000000 0.7777778 0.6666667 0.8750000
## [169] 0.6000000 1.0000000 0.8000000 0.8000000 1.0000000 1.0000000 1.0000000
## [176] 0.6666667 0.8000000 0.8000000 0.6000000 0.6666667 1.0000000 0.8000000
## [183] 0.6666667 1.0000000 1.0000000 0.6000000 0.8000000 0.6000000 1.0000000
## [190] 1.0000000 1.0000000 0.8000000 0.6000000 1.0000000 1.0000000 1.0000000
## [197] 0.6000000 0.6000000 1.0000000 0.6000000 0.6000000 1.0000000 1.0000000
## [204] 0.8571429 1.0000000 0.6000000 1.0000000 1.0000000 0.8333333 0.6000000
## [211] 1.0000000 0.6000000 1.0000000 1.0000000 0.8000000 0.6000000 0.6000000
## [218] 1.0000000 0.6666667 0.7000000 1.0000000 1.0000000 1.0000000 1.0000000
## [225] 0.6250000 0.8333333 1.0000000 0.6666667 1.0000000 0.6000000 0.6000000
## [232] 1.0000000 0.6000000 0.6666667 0.6000000 1.0000000 0.6666667 1.0000000
## [239] 1.0000000 1.0000000 0.7500000 0.6250000 0.6250000 1.0000000 0.8000000
## [246] 1.0000000 0.6666667 1.0000000 0.8235294 0.8000000 0.8000000 1.0000000
## [253] 0.8333333 0.6666667 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000
## [260] 1.0000000 0.8000000 1.0000000 1.0000000 0.6000000 1.0000000 1.0000000
## [267] 0.8000000 0.6000000 1.0000000 0.8000000 1.0000000 0.8000000 0.8000000
## [274] 0.8000000 0.6000000 1.0000000 0.8333333 1.0000000 0.8571429 0.6666667
## [281] 1.0000000 1.0000000 1.0000000 0.8235294 0.8000000 0.8000000 1.0000000
## [288] 0.6000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [295] 1.0000000 1.0000000 1.0000000 0.8000000 0.8000000 1.0000000 1.0000000
## [302] 0.8333333 1.0000000 1.0000000 1.0000000 0.8000000 0.6666667 0.8000000
## [309] 0.8000000 1.0000000 0.6000000 1.0000000 1.0000000 1.0000000 1.0000000
## [316] 0.5714286 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000 0.8000000
## [323] 0.6000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8000000 1.0000000
## [330] 1.0000000 0.6000000 0.8000000 1.0000000 0.6666667 0.6000000 1.0000000
## [337] 1.0000000 0.6000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8333333
## [344] 1.0000000 0.6000000 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000
## [351] 0.8571429 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000 0.6666667
## [358] 1.0000000 0.8000000 1.0000000 0.8000000 1.0000000 1.0000000 0.8333333
## [365] 1.0000000 0.6000000 1.0000000 0.8000000 1.0000000 0.8333333 1.0000000
## [372] 0.8333333 1.0000000 1.0000000 0.8333333 1.0000000 1.0000000 1.0000000
## [379] 0.8000000 0.6000000 0.8000000 0.8000000 0.5555556 0.8000000 1.0000000
## [386] 1.0000000 0.8000000 1.0000000 0.6000000 1.0000000 0.6000000 1.0000000
## [393] 1.0000000 0.8000000 0.6000000 0.8000000 0.8000000 1.0000000 0.5555556
## [400] 1.0000000 0.8000000 0.6000000 1.0000000 1.0000000 0.8000000 0.8000000
## [407] 0.8000000 1.0000000 0.6666667 1.0000000 0.8000000 0.8000000 0.8000000
## [414] 0.8000000 1.0000000 1.0000000 0.8333333 1.0000000 1.0000000 1.0000000
## [421] 0.8000000 0.6000000 1.0000000 0.6666667 0.6000000 0.6000000 1.0000000
## [428] 1.0000000 0.6000000 1.0000000 0.6666667 1.0000000 1.0000000 1.0000000
## [435] 1.0000000 0.8000000 1.0000000 1.0000000 0.6000000 1.0000000 1.0000000
## [442] 0.6000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8000000 0.8235294
## [449] 0.8750000 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000
## [456] 0.6000000 1.0000000 0.8000000 0.8000000 1.0000000 0.8000000 0.7142857
## [463] 1.0000000 1.0000000 0.8000000
## Levels: IPA Ale
CM
## Confusion Matrix and Statistics
##
##
## classifications IPA Ale
## IPA 104 33
## Ale 52 276
##
## Accuracy : 0.8172
## 95% CI : (0.779, 0.8513)
## No Information Rate : 0.6645
## P-Value [Acc > NIR] : 1.624e-13
##
## Kappa : 0.5773
##
## Mcnemar's Test P-Value : 0.05089
##
## Sensitivity : 0.6667
## Specificity : 0.8932
## Pos Pred Value : 0.7591
## Neg Pred Value : 0.8415
## Prevalence : 0.3355
## Detection Rate : 0.2237
## Detection Prevalence : 0.2946
## Balanced Accuracy : 0.7799
##
## 'Positive' Class : IPA
##
#Above Confusion Matrix and Statistics show a accuracy of ~80%
for(j in 1:iterations)
{
accs = data.frame(accuracy = numeric(30), k = numeric(30))
trainIndices = sample(1:dim(Data_Ale_IPA)[1],round(splitPerc * dim(Data_Ale_IPA)[1]))
beer_train = Data_Ale_IPA[trainIndices,]
beer_test = Data_Ale_IPA[-trainIndices,]
for(i in 1:numks)
{
classifications = knn(beer_train[,c(4,5)],beer_test[,c(4,5)],beer_train$Style1, prob = TRUE, k = i)
table(classifications,beer_test$Style1)
CM = confusionMatrix(table(classifications,beer_test$Style1))
masterAcc[j,i] = CM$overall[1]
}
}
MeanAcc = colMeans(masterAcc)
p = ggplot(mapping = aes(x = seq(1,numks,1), y = MeanAcc)) + geom_line() + ggtitle("Mean Accuracy v. Number of k") + xlab('k values')
ggplotly(p)
Considering the IPA and Ale data using 70%/30% split and shuffling test and training data 500 times, with K assigned from 1 to 30 during each shuffling, we can achieve the highest mean accuracy 80.0% when K=5.
iterations = 500
masterAcc = matrix(nrow = iterations)
masterSen = matrix(nrow = iterations)
masterSpec = matrix(nrow = iterations)
splitPerc = .7
for(j in 1:iterations)
{
trainIndices = sample(1:dim(Data_Ale_IPA)[1],round(splitPerc * dim(Data_Ale_IPA)[1]))
beer_train = Data_Ale_IPA[trainIndices,]
beer_test = Data_Ale_IPA[-trainIndices,]
model = naiveBayes(beer_train[,c(4,5)],as.factor(beer_train$Style1),laplace = 1)
table(predict(model,beer_test[,c(4,5)]),as.factor(beer_test$Style1))
CM = confusionMatrix(table(predict(model,beer_test[,c(4,5)]),as.factor(beer_test$Style1)))
masterAcc[j] = CM$overall[1]
masterSen[j] = CM$byClass[1]
masterSpec[j] = CM$byClass[2]
}
MeanAcc = colMeans(masterAcc)
MeanSen = colMeans(masterSen)
MeanSpec = colMeans(masterSpec)
MeanAcc
## [1] 0.7944387
MeanSen
## [1] 0.6708932
MeanSpec
## [1] 0.8668096
Naive Bayes model run for 500 iterations and 70/30% split gave the Mean accuracy (~79%), Mean sensitivity (~67%) and Mean specificity (~86%) as shown from the values above. The data is close match with kNN model.
Please go through the presentation final conclusion for the entire data.